import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import os
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
if not os.path.exists("figures"):
os.makedirs("figures")
theme_colors = px.colors.qualitative.G10
template = "plotly_white"df = pd.read_csv("data/clean_lightcast_job_postings.csv")
print("Dataset shape:", df.shape)
print("\nSalary statistics:")
print(df['SALARY'].describe())Dataset shape: (69198, 109)
Salary statistics:
count 69198.000000
mean 116806.493352
std 29426.450593
min 15860.000000
25% 116300.000000
50% 116300.000000
75% 116300.000000
max 500000.000000
Name: SALARY, dtype: float64
employment_counts = df['EMPLOYMENT_TYPE_NAME'].value_counts().reset_index()
employment_counts.columns = ['Employment Type', 'Count']
figa = px.bar(employment_counts, x='Employment Type', y='Count',
title='Job Postings by Employment Type',
color='Employment Type', color_discrete_sequence=theme_colors,
template=template)
figa.update_layout(
xaxis_title="Employment Type",
yaxis_title="Number of Postings",
legend_title="Employment Type",
font=dict(family="Arial, sans-serif", size=12)
)
figa.write_html("figures/job_postings_by_employment_type.html")
figa.write_image("figures/job_postings_by_employment_type.png")
figa.show()salary_df = df[df['SALARY'].notna() & (df['SALARY'] > 0) & (df['SALARY'] < 500000)]
figb = px.box(salary_df, x="CITY_NAME", y="SALARY",
title="Salary Distribution by City",
color_discrete_sequence=theme_colors,
template=template)
# Show only top 10 cities by median salary to make the visualization clearer
top_cities = salary_df.groupby('CITY_NAME')['SALARY'].median().nlargest(10).index.tolist()
figb = px.box(salary_df[salary_df['CITY_NAME'].isin(top_cities)],
x="CITY_NAME", y="SALARY",
title="Salary Distribution by Top 10 Cities",
color="CITY_NAME", color_discrete_sequence=theme_colors,
template=template)
figb.update_layout(
xaxis_title="City",
yaxis_title="Salary ($)",
showlegend=False,
font=dict(family="Arial, sans-serif", size=12)
)
figb.write_html("figures/salary_distribution_by_city.html")
figb.write_image("figures/salary_distribution_by_city.png")
figb.show()remote_counts = df['REMOTE_TYPE_NAME'].value_counts().reset_index()
remote_counts.columns = ['Remote Type', 'Count']
figc = px.pie(remote_counts, names='Remote Type', values='Count',
title='Distribution of Remote vs. On-Site Jobs',
color='Remote Type', color_discrete_sequence=theme_colors,
template=template)
figc.update_layout(
legend_title="Remote Status",
font=dict(family="Arial, sans-serif", size=12)
)
figc.write_html("figures/remote_vs_onsite_distribution.html")
figc.write_image("figures/remote_vs_onsite_distribution.png")
figc.show()# Convert date columns to datetime
date_columns = ['LAST_UPDATED_DATE', 'POSTED', 'EXPIRED', 'MODELED_EXPIRED']
for col in date_columns:
if col in df.columns:
df[col] = pd.to_datetime(df[col], errors='coerce')
df['POSTING_MONTH'] = df['POSTED'].dt.to_period('M').astype(str)# Create a time-based analysis of job postings
monthly_postings = df.groupby('POSTING_MONTH').size().reset_index(name='count')
monthly_postings['POSTING_MONTH'] = pd.to_datetime(monthly_postings['POSTING_MONTH'])
monthly_postings = monthly_postings.sort_values('POSTING_MONTH')
monthly_postings['POSTING_MONTH'] = monthly_postings['POSTING_MONTH'].dt.strftime('%Y-%m')
fig1 = px.line(monthly_postings, x='POSTING_MONTH', y='count',
title='Monthly Job Posting Trends',
labels={'count': 'Number of Postings', 'POSTING_MONTH': 'Month'},
template=template)
fig1.update_layout(
xaxis_title="Month",
yaxis_title="Number of Job Postings",
legend_title="Legend",
font=dict(family="Arial, sans-serif", size=12)
)
fig1.write_html("figures/monthly_job_posting_trends.html")
fig1.write_image("figures/monthly_job_posting_trends.png")
fig1.show()# Employment Type Analysis
employment_counts = df['EMPLOYMENT_TYPE_NAME'].value_counts().reset_index()
employment_counts.columns = ['Employment Type', 'Count']
fig2 = px.bar(employment_counts, x='Employment Type', y='Count',
title='Job Postings by Employment Type',
color='Employment Type', color_discrete_sequence=theme_colors,
template=template)
fig2.update_layout(
xaxis_title="Employment Type",
yaxis_title="Number of Postings",
legend_title="Employment Type",
font=dict(family="Arial, sans-serif", size=12)
)
fig2.write_html("figures/job_postings_by_employment_type.html")
fig2.write_image("figures/job_postings_by_employment_type.png")
fig2.show()
# Remote Work Analysis with nicer formatting
remote_counts = df['REMOTE_TYPE_NAME'].value_counts().reset_index()
remote_counts.columns = ['Remote Type', 'Count']
fig3 = px.pie(remote_counts, names='Remote Type', values='Count',
title='Distribution of Remote vs. On-Site Jobs',
color='Remote Type', color_discrete_sequence=theme_colors,
template=template)
fig3.update_layout(
legend_title="Remote Status",
font=dict(family="Arial, sans-serif", size=12)
)
fig3.write_html("figures/remote_vs_onsite_distribution.html")
fig3.write_image("figures/remote_vs_onsite_distribution.png")
fig3.show()# Salary Analysis
salary_df = df[df['SALARY'].notna() & (df['SALARY'] > 0)]
# Top cities by average salary (with at least 50 job postings)
city_salary = salary_df.groupby('CITY_NAME').agg(
avg_salary=('SALARY', 'mean'),
job_count=('SALARY', 'count')
).reset_index()
city_salary_filtered = city_salary[city_salary['job_count'] >= 50].sort_values('avg_salary', ascending=False).head(10)
fig4 = px.bar(city_salary_filtered, x='CITY_NAME', y='avg_salary',
title='Top 10 Cities by Average Salary (Min. 50 Postings)',
text='job_count', color='avg_salary', color_continuous_scale='Viridis',
template=template)
fig4.update_traces(texttemplate='%{text} jobs', textposition='outside')
fig4.update_layout(
xaxis_title="City",
yaxis_title="Average Salary ($)",
coloraxis_showscale=False,
font=dict(family="Arial, sans-serif", size=12)
)
fig4.write_html("figures/top_cities_by_avg_salary.html")
fig4.write_image("figures/top_cities_by_avg_salary.png")
fig4.show()# Salary distribution by state
state_salary = salary_df.groupby('STATE_NAME').agg(
avg_salary=('SALARY', 'mean'),
median_salary=('SALARY', 'median'),
job_count=('SALARY', 'count')
).reset_index()
state_salary_filtered = state_salary[state_salary['job_count'] >= 100].sort_values('avg_salary', ascending=False)
fig5 = px.bar(state_salary_filtered, x='STATE_NAME', y='avg_salary',
title='Average Salary by State (Min. 100 Postings)',
color='job_count', text='median_salary',
labels={'job_count': 'Number of Job Postings'},
color_continuous_scale='Viridis',
template=template)
fig5.update_traces(texttemplate='$%{text:.0f} median', textposition='outside')
fig5.update_layout(
xaxis_title="State",
yaxis_title="Average Salary ($)",
coloraxis_colorbar_title="Job Count",
font=dict(family="Arial, sans-serif", size=12)
)
fig5.write_html("figures/avg_salary_by_state.html")
fig5.write_image("figures/avg_salary_by_state.png")
fig5.show()# Experience requirements analysis
exp_df = df[df['MIN_YEARS_EXPERIENCE'].notna()]
fig6 = px.histogram(exp_df, x='MIN_YEARS_EXPERIENCE',
title='Distribution of Minimum Experience Requirements',
color_discrete_sequence=theme_colors,
template=template)
fig6.update_layout(
xaxis_title="Minimum Years of Experience",
yaxis_title="Number of Job Postings",
bargap=0.1,
font=dict(family="Arial, sans-serif", size=12)
)
fig6.write_html("figures/min_experience_distribution.html")
fig6.write_image("figures/min_experience_distribution.png")
fig6.show()# Education level requirements vs. Salary
edu_salary = df[df['SALARY'].notna() & (df['MIN_EDULEVELS_NAME'].notna())].groupby('MIN_EDULEVELS_NAME').agg(
avg_salary=('SALARY', 'mean'),
median_salary=('SALARY', 'median'),
job_count=('SALARY', 'count')
).reset_index()
edu_salary = edu_salary[edu_salary['job_count'] >= 30].sort_values('avg_salary')
fig7 = px.bar(edu_salary, x='MIN_EDULEVELS_NAME', y='avg_salary',
title='Average Salary by Minimum Education Level',
color='job_count', text='median_salary',
labels={'job_count': 'Number of Job Postings', 'MIN_EDULEVELS_NAME': 'Education Level'},
color_continuous_scale='Viridis',
template=template)
fig7.update_traces(texttemplate='$%{text:.0f} median', textposition='outside')
fig7.update_layout(
xaxis_title="Minimum Education Level",
yaxis_title="Average Salary ($)",
coloraxis_colorbar_title="Job Count",
font=dict(family="Arial, sans-serif", size=12)
)
fig7.write_html("figures/salary_by_education.html")
fig7.write_image("figures/salary_by_education.png")
fig7.show()
# Industry analysis
if 'NAICS_2022_2_NAME' in df.columns:
industry_field = 'NAICS_2022_2_NAME'
else:
industry_field = 'NAICS2_NAME'
industry_counts = df[df[industry_field].notna()][industry_field].value_counts().reset_index()
industry_counts.columns = ['Industry', 'Count']
industry_counts = industry_counts.head(10) # Top 10 industries
fig8 = px.bar(industry_counts, x='Count', y='Industry',
title='Top 10 Industries by Job Posting Count',
orientation='h', color='Count', color_continuous_scale='Viridis',
template=template)
fig8.update_layout(
xaxis_title="Number of Job Postings",
yaxis_title="Industry",
font=dict(family="Arial, sans-serif", size=12),
yaxis={'categoryorder':'total ascending'}
)
fig8.write_html("figures/top_industries.html")
fig8.write_image("figures/top_industries.png")
fig8.show()
# Salary Box Plot by Remote Type
fig9 = px.box(salary_df, x='REMOTE_TYPE_NAME', y='SALARY',
title='Salary Distribution by Remote Work Type',
color='REMOTE_TYPE_NAME', color_discrete_sequence=theme_colors,
template=template)
fig9.update_layout(
xaxis_title="Remote Work Type",
yaxis_title="Salary ($)",
showlegend=False,
font=dict(family="Arial, sans-serif", size=12)
)
fig9.write_html("figures/salary_by_remote_type.html")
fig9.write_image("figures/salary_by_remote_type.png")
fig9.show()# Create a combined visualization: Salary distribution by employment type with average markers
fig10 = px.box(salary_df, x='EMPLOYMENT_TYPE_NAME', y='SALARY',
title='Salary Distribution by Employment Type',
color='EMPLOYMENT_TYPE_NAME', color_discrete_sequence=theme_colors,
template=template)
emp_salary_avg = salary_df.groupby('EMPLOYMENT_TYPE_NAME')['SALARY'].mean().reset_index()
for i, emp_type in enumerate(emp_salary_avg['EMPLOYMENT_TYPE_NAME']):
avg_salary = emp_salary_avg.loc[emp_salary_avg['EMPLOYMENT_TYPE_NAME'] == emp_type, 'SALARY'].values[0]
fig10.add_trace(
go.Scatter(
x=[emp_type],
y=[avg_salary],
mode='markers',
marker=dict(color='red', size=10, symbol='star'),
name=f'Average: ${avg_salary:.0f}'
)
)
fig10.update_layout(
xaxis_title="Employment Type",
yaxis_title="Salary ($)",
showlegend=False,
font=dict(family="Arial, sans-serif", size=12)
)
fig10.write_html("figures/salary_by_employment_type.html")
fig10.write_image("figures/salary_by_employment_type.png")
fig10.show()